In [1]:
# Imports and global plotting configuration for the library-comparison notebook.
# static plotting with matplotlib
import matplotlib.pyplot as plt
# data manipulation
import numpy as np
import pandas as pd
# declarative charts with altair
import altair as alt
# statistical plots with seaborn
import seaborn as sns
# grammar-of-graphics plots with plotnine
# NOTE(review): wildcard import; the cells below use ggplot, aes, geom_point,
# ggtitle, xlab and ylab from it -- explicit imports would make that visible.
from plotnine import *
# plotly: graph objects, express API and renderer/template settings
import plotly.graph_objs as go
import plotly as py
import plotly.express as px
import plotly.io as pio
# use the "seaborn" template as the default for every plotly figure
pio.templates.default = "seaborn"
# enable plotly offline mode inside the notebook
py.offline.init_notebook_mode()
# project helper module (provides formatSemester, used when preparing the data)
# NOTE(review): presumably loaded from a sibling utils notebook via the ipynb package -- confirm
import ipynb.fs.full.utils as utils
# filesystem paths
from pathlib import Path
# activate the 'default' altair renderer; being the last expression, its return value is the cell output
alt.renderers.enable('default')
Out[1]:
RendererRegistry.enable('default')
In [2]:
#data directory: <current working directory>/data/v1_4
CURR_PATH = Path.cwd() / 'data' / 'v1_4'
#path of students_data.csv inside that directory
filename = CURR_PATH / 'students_data.csv'
In [3]:
#load the semicolon-separated CSV file into a DataFrame
students_df = pd.read_csv(filepath_or_buffer=filename, delimiter=';')
In [4]:
#study program analysed in this notebook and the study durations treated as outliers
PROGRAM_ID = 143
OUTLIER_DURATIONS = [1, 3]

#select graduates and dropouts of the program and drop duplicate rows;
#.copy() turns each selection into an independent frame, so the column
#assignments below never write into a slice of students_df (SettingWithCopyWarning)
in_program = students_df.Program_ID == PROGRAM_ID
graduate_students = (
    students_df[(students_df.Student_Label == "Graduate") & in_program]
    .drop_duplicates()
    .copy()
)
dropout_students = (
    students_df[(students_df.Student_Label == "Dropout") & in_program]
    .drop_duplicates()
    .copy()
)

#format graduation and start semester with the project helper
#NOTE(review): utils.formatSemester is defined outside this file; its result is
#wrapped in np.array so it is assigned by position rather than aligned on the index
graduate_students["Graduation_Semester"] = np.array(utils.formatSemester(graduate_students, "Graduation_Semester"))
graduate_students["Start_Semester"] = np.array(utils.formatSemester(graduate_students, "Start_Semester"))

#remove outlier values: rows whose study duration is one of OUTLIER_DURATIONS
#(rows with a missing duration are kept, as before); copy again so downstream
#cells work on an independent frame
graduate_students = graduate_students[
    ~graduate_students.Duration_Start_Graduation.isin(OUTLIER_DURATIONS)
].copy()

Matplotlib

In [5]:
#scatter plot of study duration (x) against graduation semester (y)
fig, ax = plt.subplots(figsize=(10, 5))

#one scatter call per final grade so every grade gets its own legend entry
for grade, grade_rows in graduate_students.groupby('Graduation_Grade'):
    ax.scatter(
        x=grade_rows['Duration_Start_Graduation'],
        y=grade_rows['Graduation_Semester'],
        label=grade,
    )

#title, legend and axis labels
ax.set_title('Anzahl Semester vs Abschlussnote')
ax.legend()
ax.set_xlabel('Anzahl Semester')
ax.set_ylabel('Jahr')
Out[5]:
Text(0, 0.5, 'Jahr')

Seaborn

In [6]:
#single-facet grid, one hue per final grade
g = sns.FacetGrid(data=graduate_students, hue='Graduation_Grade', height=6, aspect=1.5)
#draw study duration (x) against graduation semester (y) for every hue level
g.map(plt.scatter, 'Duration_Start_Graduation', 'Graduation_Semester')
g.add_legend()
#assign the result so the cell does not echo the grid object
g = g.set(
    title='Anzahl Semester vs Abschlussnote',
    xlabel='Anzahl Semester',
    ylabel='Jahr',
)

Plotnine

In [7]:
#scatter plot: study duration (x) against graduation semester (y), coloured by final grade
(
    ggplot(
        data=graduate_students,
        mapping=aes(
            x='Duration_Start_Graduation',
            y='Graduation_Semester',
            color='Graduation_Grade',
        ),
    )
    + geom_point()
    #title and both axis labels set in a single labs() call
    + labs(
        title='Anzahl Semester vs Abschlussnote',
        x='Anzahl Semester',
        y='Jahr',
    )
)
Out[7]:
<ggplot: (8726827601914)>

Plotly

In [8]:
#order the rows by graduation semester; re-assigning (instead of inplace=True)
#keeps the cell re-runnable and avoids mutating a filtered slice in place
#NOTE(review): presumably the sort fixes the order in which semesters appear on the y axis -- confirm
graduate_students = graduate_students.sort_values("Graduation_Semester")

#one marker trace per final grade; the loop variable is called `grade` so that
#the seaborn FacetGrid stored in `g` is not overwritten
traces = []
for grade in graduate_students['Graduation_Grade'].unique():
    #row mask for this grade, computed once and used for both axes
    is_grade = graduate_students['Graduation_Grade'] == grade
    traces.append(
        go.Scatter(
            mode='markers',
            x=graduate_students.loc[is_grade, 'Duration_Start_Graduation'],
            y=graduate_students.loc[is_grade, 'Graduation_Semester'],
            name=grade))

#assemble the figure from the traces and the layout (title, axis titles, width)
fig = go.Figure(
    layout=dict(
        width=700,
        title='Anzahl Semester vs Abschlussnote',
        xaxis={'title': 'Anzahl Semester'},
        yaxis={'title': 'Jahr'},
    ),
    data=traces
)
fig.show()

Altair

In [9]:
#circle-mark chart: study duration (x) against graduation semester (y), coloured by final grade
alt.Chart(
    graduate_students,
    title='Anzahl Semester vs Abschlussnote',
).mark_circle().encode(
    x=alt.X('Duration_Start_Graduation', title='Anzahl Semester'),
    y=alt.Y('Graduation_Semester', title='Jahr'),
    color='Graduation_Grade',
)
Out[9]:

Abbildung 2.7

In [10]:
#box plot per study duration, one colour per final grade
#NOTE(review): the y axis plots Student_ID values but is labelled
#"Studentenanzahl" (number of students) -- confirm this is intended
fig = px.box(
    graduate_students,
    x="Duration_Start_Graduation",
    y="Student_ID",
    color="Graduation_Grade",
    title="Abschlussnote vs Dauer des Studiums",
)
#figure size, axis titles, tick density and legend title
fig.update_layout(
    width=1000,
    height=500,
    xaxis_title="Dauer des Studiums (Semester) ",
    xaxis_nticks=20,
    yaxis_title="Studentenanzahl",
    legend=dict(title=dict(text='Abschlussnote')),
)
fig.show()

Abbildung 2.9:

In [11]:
#violin plot of Duration_Min_Max_Exam for dropouts, split by gender,
#with an inner box plot and every data point drawn
fig = px.violin(
    dropout_students,
    y="Duration_Min_Max_Exam",
    color="Gender",
    box=True,
    points='all',
)
#y-axis title and tick density, legend title
fig.update_layout(
    yaxis_title="Semesteranzahl",
    yaxis_nticks=30,
    legend_title_text='Geschlecht',
)

fig.show()

Abbildung 2.15:

In [12]:
#keep only the two columns shown in the diagram
ds = dropout_students.loc[:, ['Gender', 'Duration_Min_Max_Exam']]

#parallel-categories diagram of gender and duration, coloured by duration
axis_labels = {"Duration_Min_Max_Exam": "Dauer", "Gender": "Geschlecht"}
fig = px.parallel_categories(
    data_frame=ds,
    color="Duration_Min_Max_Exam",
    color_continuous_scale=px.colors.diverging.Tealrose,
    labels=axis_labels,
    title="Anzahl der Semester zum Abbruch",
)

fig.show()